This is an R Markdown document containing code and plots for Abumazen et al. “Nasopharyngeal metatranscriptomics reveals host-pathogen signatures of pediatric sinusitis”.
library(reshape2)
library(pheatmap)
library(pROC)
library(RColorBrewer)
library(dplyr)
library(plyr)
library(vioplot)
library(Metrics)
library(DESeq2)
library(tximport)
library(EnhancedVolcano)
library(patchwork)
library(knitr)
Reading in the kraken2 data
tb = read.delim("d1-d5_fastp_dehost_krakenmerged_reads_normalized.txt",sep='\t',col.names=(c("X","Y","Z")))
matr = dcast(tb,Y ~ X,value.var = "Z")
rownames(matr) = matr[,1]
matr = matr[,-1]
for (i in 1:nrow(matr))
{ matr[i,which(is.na(matr[i,]))] = 0
}
matr = t(data.matrix(matr))
splitrownames = vector(length = nrow(matr))
for (i in 1:length(splitrownames))
{
rownames(matr)[i] = strsplit(rownames(matr),"_")[[i]][1]
}
Reading in the metadata
metadata = read.csv("metadata.12.20.updated.csv",header=T)
metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1
shared = intersect(rownames(matr),metadata[,1])
matr = matr[match(shared,rownames(matr)),]
metadata = metadata[match(shared,metadata[,1]),]
annotation_row = metadata[,c(22,24,26)]
annotation_row = (annotation_row -2) * -1 #just to make it going 0 to 1
rownames(annotation_row) = metadata[,1]
colnames(annotation_row) = c("SPN", "HFLU", "MCAT")
cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
"YlGnBu")))(99))
ann_colors = list(
SPN = c("white", "light gray"),
HFLU = c("white", "light gray"),
MCAT = c("white", "light gray")
)
pheatmap(log(matr[,c(grep("Moraxella cat", colnames(matr)),grep("Streptococcus pne", colnames(matr)),grep("Haemophilus inf", colnames(matr)))] + 1, 10),annotation_row=annotation_row[,c(2,1,3)],cluster_cols = F, show_rownames = FALSE,col=cols,annotation_colors = ann_colors)
group = metadata[,"mcat"]
group <- mapvalues(group,
from=c("2","1"),
to=c("-","+"))
group = as.factor(group)
abund = log(matr[,"Moraxella catarrhalis"] + 1,10)
p = ggplot(data.frame(mcat = abund), aes(y = abund, x = group,fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
theme(legend.position = "none") +
ggtitle("mcat") + theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge())
p
group = metadata[,"spn"]
group <- mapvalues(group,
from=c("2","1"),
to=c("-","+"))
group = as.factor(group)
abund = log(matr[,"Streptococcus pneumoniae"] + 1,10)
p = ggplot(data.frame(spn = abund), aes(y = abund, x = group,fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
theme(legend.position = "none") +
ggtitle("spn") + theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge())
p
group = metadata[,"hflu"]
group <- mapvalues(group,
from=c("2","1"),
to=c("-","+"))
group = as.factor(group)
abund = log(matr[,"Haemophilus influenzae"] + 1,10)
p = ggplot(data.frame(hflu = abund), aes(y = abund, x = group,fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
theme(legend.position = "none") +
ggtitle("hflu") + theme(plot.title = element_text(hjust = 0.5)) +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge())
p
boxplot(log((matr[which(metadata[,"mcat"] == 2),"Moraxella catarrhalis"]) + 1, 10),log((matr[which(metadata[,"mcat"] != 2),"Moraxella catarrhalis"]) + 1, 10),main="MCAT",names=c("-","+"),ylab="log10(RPM)")
boxplot(log((matr[which(metadata[,"spn"] == 2),"Streptococcus pneumoniae"]) + 1,10),log((matr[which(metadata[,"spn"] != 2),"Streptococcus pneumoniae"]) + 1, 10),main="SPN",names=c("-","+"),ylab="log10(RPM)")
boxplot(log((matr[which(metadata[,"hflu"] == 2),"Haemophilus influenzae"]) + 1,10),log((matr[which(metadata[,"hflu"] != 2),"Haemophilus influenzae"]) + 1, 10),main="HFLU",names=c("-","+"),ylab="log10(RPM)")
bacterial_detection_threshold = 1
values = unique(rev(sort(matr[,"Moraxella catarrhalis"])))
TPR = vector(length = length(values))
FPR = vector(length = length(values))
for (i in 1:length(values))
{ k = values[i]
matches = which(matr[,"Moraxella catarrhalis"] >= k)
TPR[i] = length(intersect(matches,which(metadata[,"mcat"] == 1))) / length(which(metadata[,"mcat"] == 1))
FPR[i] = length(intersect(matches,which(metadata[,"mcat"] == 2))) / length(which(metadata[,"mcat"] == 2))
}
prediction = matr[,"Moraxella catarrhalis"]
response = metadata[,"mcat"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.823844
auc_mcat = auc(response,prediction)
plot(FPR,TPR,type="l",main=paste("mcat","AUC = ",round(auc_mcat,2)))
matches = which(matr[,"Moraxella catarrhalis"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"mcat"] == 1))) / length(which(metadata[,"mcat"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"mcat"] == 2))) / length(which(metadata[,"mcat"] == 2))
points(x = fpr0, y = tpr0,cex=2)
# prints out the sensitivity and the specificity given the threshold defined above
print(c("mcat",tpr0 * 100,100-(fpr0 * 100)))
## [1] "mcat" "93.2203389830508" "52.4271844660194"
sens_mcat = tpr0 * 100
spec_mcat = 100-(fpr0 * 100)
values = unique(rev(sort(matr[,"Streptococcus pneumoniae"])))
TPR = vector(length = length(values))
FPR = vector(length = length(values))
for (i in 1:length(values))
{ k = values[i]
matches = which(matr[,"Streptococcus pneumoniae"] >= k)
TPR[i] = length(intersect(matches,which(metadata[,"spn"] == 1))) / length(which(metadata[,"spn"] == 1))
FPR[i] = length(intersect(matches,which(metadata[,"spn"] == 2))) / length(which(metadata[,"spn"] == 2))
}
prediction = matr[,"Streptococcus pneumoniae"]
response = metadata[,"spn"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.8915326
auc_spn = auc(response,prediction)
plot(FPR,TPR,type="l",main=paste("spn","AUC = ",round(auc_spn,2)))
matches = which(matr[,"Streptococcus pneumoniae"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"spn"] == 1))) / length(which(metadata[,"spn"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"spn"] == 2))) / length(which(metadata[,"spn"] == 2))
points(x = fpr0, y = tpr0,cex=2)
# prints out the sensitivity and the specificity given the threshold defined above
print(c("spn",tpr0 * 100,100-(fpr0 * 100)))
## [1] "spn" "87.1428571428571" "81.4569536423841"
sens_spn = tpr0 * 100
spec_spn = 100-(fpr0 * 100)
values = unique(rev(sort(matr[,"Haemophilus influenzae"])))
TPR = vector(length = length(values))
FPR = vector(length = length(values))
for (i in 1:length(values))
{ k = values[i]
matches = which(matr[,"Haemophilus influenzae"] >= k)
TPR[i] = length(intersect(matches,which(metadata[,"hflu"] == 1))) / length(which(metadata[,"hflu"] == 1))
FPR[i] = length(intersect(matches,which(metadata[,"hflu"] == 2))) / length(which(metadata[,"hflu"] == 2))
}
prediction = matr[,"Haemophilus influenzae"]
response = metadata[,"hflu"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.9549669
auc_hflu = auc(response,prediction)
plot(FPR,TPR,type="l",main=paste("hflu","AUC = ",round(auc_hflu,2)))
matches = which(matr[,"Haemophilus influenzae"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"hflu"] == 1))) / length(which(metadata[,"hflu"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"hflu"] == 2))) / length(which(metadata[,"hflu"] == 2))
points(x = fpr0, y = tpr0,cex=2)
# prints out the sensitivity and the specificity given the threshold defined above
print(c("hflu",tpr0 * 100,100-(fpr0 * 100)))
## [1] "hflu" "95.7142857142857" "86.7549668874172"
sens_hflu = tpr0 * 100
spec_hflu = 100-(fpr0 * 100)
accuracies = cbind(sens = c(mcat = sens_mcat,spn = sens_spn,hflu = sens_hflu),spec = c(mcat = spec_mcat,spec_spn,spec_hflu),auc = c(auc_mcat, auc_spn, auc_hflu))
kable(accuracies)
| sens | spec | auc | |
|---|---|---|---|
| mcat | 93.22034 | 52.42718 | 0.8238440 |
| spn | 87.14286 | 81.45695 | 0.8915326 |
| hflu | 95.71429 | 86.75497 | 0.9549669 |
First, we will produce a heatmap of all kraken2-predicted taxa that contain “virus” in their organism names.
allViruses = matr[,grep("virus",colnames(matr))]
pheatmap(t(log(allViruses[,which(apply(allViruses,2,max) >= 10)] + 1,10)),fontsize_col=5)
genesymbols = read.delim("host-expression/gencode.v39.metadata.HGNC")
metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1
metadata[,"batch"] = as.factor(metadata[,"batch"])
metadata[,"bv_both_none"] = as.factor(metadata[,"bv_both_none"])
metadata[metadata[,"cum_pathogn"] > 1,"cum_pathogn"] = 1
metadata[,"cum_pathogn"] = as.factor(metadata[,"cum_pathogn"])
metadata[,"sex"] = as.factor(metadata[,"sex"])
metadata$age_scaled = scale(metadata$Age.in.years)
onlyvirusonlybacteria_samples = metadata[which((metadata$bv_both_none == 1) | (metadata$bv_both_none == 2)),]
onlyvirusonlybacteria_samples = onlyvirusonlybacteria_samples %>% arrange(cum_pathogn)
subsetfiles <- file.path("host-expression/quants", onlyvirusonlybacteria_samples$Filename, "quant.sf")
subset_txi.salmon <- tximport(subsetfiles, type = "salmon", tx2gene = genesymbols)
dds <- DESeqDataSetFromTximport(subset_txi.salmon, onlyvirusonlybacteria_samples, ~ batch + sex + age_scaled + cum_pathogn)
dds <- DESeq(dds)
res <- results(dds)
summary(res)
##
## out of 31126 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2944, 9.5%
## LFC < 0 (down) : 1891, 6.1%
## outliers [1] : 0, 0%
## low counts [2] : 8333, 27%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
pairwise_genes = which(res$padj < 0.001)
bac_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange > 0))
viral_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange < 0))
There are 548 bacterial upDEGs and 273 viral upDEGs.
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue',
pCutoff = 0.001,
FCcutoff = 0,
xlim = c(-7, 7),
ylim=c(-1,27),
pointSize = 1.5,
labSize = 2.5,
title = 'DESeq2 results',
subtitle = 'Differential expression',
caption = 'p-value cutoff, 0.001',
legendPosition = "none",
legendLabSize = 14,
col = c('grey30', 'light gray', 'royalblue', 'red2'),
colAlpha = 0.9,
drawConnectors = FALSE,
widthConnectors = 0.5)
Read in full set of files, including those with NO pathogens and BOTH (virus and bacteria)
files <- file.path("host-expression/quants", metadata$Filename, "quant.sf")
txi.salmon <- tximport(files, type = "salmon", tx2gene = genesymbols)
pdds <- DESeqDataSetFromTximport(txi.salmon, metadata, ~ batch + sex + age_scaled + cum_pathogn)
prld <- varianceStabilizingTransformation(pdds, blind = TRUE)
These are plots of host gene expression per clinical group
pathOrNot = as.numeric(metadata[,"bv_both_none"]) # using clinical categories
genelist = c("CXCL10", "PRF1", "IFI27", "CCL8", "PTGS2", "S100A9", "PLAUR", "TNFAIP3","IL1B")
for (i in 1:length(genelist)) {
gene = genelist[i]
print(gene)
p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
geom_boxplot(outlier.shape=NA) +
ggtitle(gene) +
theme(axis.title = element_blank(), legend.position="none") +
labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("None detected", "Bacterial", "Viral", "Both")) +
labs(fill = "Infection type")
assign(paste0("p", i), p)
}
## [1] "CXCL10"
## [1] "PRF1"
## [1] "IFI27"
## [1] "CCL8"
## [1] "PTGS2"
## [1] "S100A9"
## [1] "PLAUR"
## [1] "TNFAIP3"
## [1] "IL1B"
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + theme(legend.position="right") + plot_layout(nrow = 3)
Setting up data for sorted heatmaps
#setting up a temp matrix with Z-score scaled expression levels
temp = t(scale(t(assay(prld[,]))))
temp[temp < -5] = -5
temp[temp > +5] = +5
temp[is.na(temp)] = 0
annotation_col = data.frame(
Category = metadata$high_pathogens,
Viral_pathogen_abundance = metadata[,"viral_quantile"],
Bacterial_pathogen_abundance = metadata[,"threebac_quantile"],
Symptom_severity_score = metadata[,"PRSS"],
Days_with_cold = metadata[,"days_with_cold"],
Infection = factor(metadata[,"bv_both_none"], labels = c("None","Bacterial", "Viral", "Both"))
)
heatmap <- pheatmap(temp[bac_upDEGs,order(bacPathAbundance)],
annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
Resistance = c("No" = "green", "Yes" = "red")),
annotation_col = annotation_col,
show_colnames = F,
show_rownames = F,
border_color=NA,
col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
#col = colorRampPalette(c("navy", "white", "red"))(50),
fontsize_row=6,
cluster_cols = F,
cluster_rows = T,
annotation_legend = TRUE)
heatmap1 <- pheatmap(temp[viral_upDEGs,order(metadata$high_pathogen)],
annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
Resistance = c("No" = "white", "Yes" = "black")),
annotation_col = annotation_col,
show_colnames = F,
show_rownames = F,
border_color=NA,
col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
#col = colorRampPalette(c("navy", "white", "red"))(50),
fontsize_row=6,
cluster_cols = F,
cluster_rows = T,
annotation_legend = TRUE)
heatmap2 <- pheatmap(temp[bac_upDEGs,order(metadata$high_pathogen)],
annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
Resistance = c("No" = "white", "Yes" = "black")),
annotation_col = annotation_col,
show_colnames = F,
show_rownames = F,
border_color=NA,
col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
#col = colorRampPalette(c("navy", "white", "red"))(50),
fontsize_row=6,
cluster_cols = F,
cluster_rows = T,
annotation_legend = TRUE)
viralResponseScore = apply(temp[viral_upDEGs,],2,mean)
bacterialResponseScore = apply(temp[bac_upDEGs,],2,mean)
group = as.numeric(metadata[,"threebac_quantile"])
p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
labs(fill = "Pathogen abundance") +
labs(x = "Bacterial pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") +
theme(legend.position = "none")
p
group = as.numeric(metadata[,"viral_quantile"])
p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
labs(fill = "Pathogen abundance") +
labs(x = "Viral pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)") +
theme(legend.position = "none")
p
group = as.numeric(metadata[,"high_pathogens"])
p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
labs(fill = "Pathogen abundance") +
labs(x = "Pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)")
p
group = as.numeric(metadata[,"high_pathogens"])
p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
geom_boxplot(outlier.shape=NA) +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
labs(fill = "Pathogen abundance") +
labs(x = "Pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)")
p
pathOrNot = as.numeric(metadata[,"high_pathogens"]) # using categories "Both low" "Viral high" "Bacterial high" "Both high"
genelist = c("PTGS2", "S100A9", "PLAUR", "TNFAIP3", "SERPINA2", "CXCL2","IL1B", "CXCL11","IFNL1","CXCL10", "PRF1", "IFI27", "CCL8","OAS2")
for (i in 1:length(genelist)) {
gene = genelist[i]
p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
geom_boxplot(outlier.shape=NA) +
ggtitle(gene) +
theme(axis.title = element_blank(), legend.position="none") +
geom_point(pch = 21, position = position_jitterdodge()) +
scale_fill_discrete(labels=c("None detected", "Bacterial", "Viral", "Both")) +
labs(fill = "Infection type")
assign(paste0("p", i), p)
}
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + p10 + p11 + p12 + p13 + p14 + plot_layout(nrow = 2)
Here, we will compute the correlation between the bacterial host response score and the total bacterial pathogen abundance (MCAT + SPN + HFLU), and compare this to 1000 randomly samples selections of three bacterial species from the data.
abundant_bacteria = setdiff(which(apply(matr,2,max) >= 10),grep("virus",colnames(matr)))
cors = vector(length = 1000)
for (i in 1:1000)
{
randSetOfThree = sample(abundant_bacteria,3)
threeBacAbundance = apply(matr[,randSetOfThree],1,sum) # sum of their individual RPMs
#ntile(threeBacAbundance, 10)
cors[i] = cor(bacterialResponseScore,log(threeBacAbundance + 1,10))
}
hist(cors,xlim=c(-1,1),breaks=100)
abline(v = cor(bacterialResponseScore, log(bacPathAbundance + 1,10)))
Viral ROC
response = vector(length = nrow(metadata))
response[] = 0
response[which(metadata$high_pathogen == 1 | metadata$high_pathogen == 3)] = 1
prediction = viralResponseScore
bacPlot = plot.roc(roc(response,prediction))
#coords(bacPlot) #for a list of sensitivities and specificities
auc(response,prediction)
## [1] 0.8752688
Bacterial ROC
response = vector(length = nrow(metadata))
response[] = 0
response[which(metadata$high_pathogen == 2 | metadata$high_pathogen == 3)] = 1
prediction = bacterialResponseScore
bacPlot = plot.roc(roc(response,prediction))
#coords(bacPlot) #for a list of sensitivities and specificities
auc(response,prediction)
## [1] 0.7663192